library(dplyr)
library(forcats)
library(vtable)
library(ggplot2)
library(plotly) # Optional

# Project Part 1
# Group 9
# Loading libraries
# Loading dataset
# Read SMARTc.csv (semicolon-separated); this is the version of the data
# without missing values.
dataset <- read.csv("SMARTc.csv", sep = ";")

# Re-encode the categorical variables: each coded column is converted to a
# factor and its level codes are replaced by readable labels.
dataset <- mutate(
  dataset,
  # Outcome indicator
  EVENT = fct_recode(factor(EVENT), "no" = "0", "yes" = "1"),
  # Patient descriptives and classical risk factors
  SEX = fct_recode(factor(SEX), "male" = "1", "female" = "2"),
  DIABETES = fct_recode(factor(DIABETES), "no" = "0", "yes" = "1"),
  SMOKING = fct_recode(
    factor(SMOKING),
    "never" = "1", "former" = "2", "current" = "3"
  ),
  alcohol = fct_recode(
    factor(alcohol),
    "never" = "1", "former" = "2", "current" = "3"
  ),
  # Previous symptomatic atherosclerosis
  CEREBRAL = fct_recode(factor(CEREBRAL), "no" = "0", "yes" = "1"),
  CARDIAC = fct_recode(factor(CARDIAC), "no" = "0", "yes" = "1"),
  AAA = fct_recode(factor(AAA), "no" = "0", "yes" = "1"),
  PERIPH = fct_recode(factor(PERIPH), "no" = "0", "yes" = "1"),
  # Markers of atherosclerosis
  albumin = fct_recode(
    factor(albumin),
    "no" = "1", "micro" = "2", "macro" = "3"
  ),
  STENOSIS = fct_recode(factor(STENOSIS), "no" = "0", "yes" = "1")
)

# Description of the dataset and table of variables
# The dataset is about cardiovascular health. It contains two outcomes:
# EVENT, the occurrence of a cardiovascular event, and TEVENT, the number of
# days the patient is in the study until the event occurs. The dataset
# contains many variables, some categorical and some numerical. It covers
# patient descriptives, classical risk factors, previous symptomatic
# atherosclerosis, and markers of atherosclerosis.
# Summary-statistics table for every variable in the dataset.
# out = "return" asks vtable to return the table as an R object rather than
# display it in a viewer; add.median = TRUE adds the median to the reported
# statistics.
sumtable(dataset, out = "return", add.median = TRUE)

# Association between variables and the outcome
# Overall proportion of patients with a cardiovascular event; drawn below as
# the dashed reference line.
avg_event_proportion <- mean(dataset[["EVENT"]] == "yes")

# Standalone copies of the two plotted columns.
SMOKING <- dataset[["SMOKING"]]
EVENT <- dataset[["EVENT"]]

# Proportion of events within each smoking category, compared against the
# overall event proportion.
bar_plot <- ggplot(data = dataset, mapping = aes(x = SMOKING, fill = EVENT))
bar_plot <- bar_plot +
  # position = "fill" rescales every bar to height 1, so the y axis shows
  # proportions rather than counts
  geom_bar(position = "fill") +
  geom_hline(linetype = "dashed", yintercept = avg_event_proportion)
bar_plot <- bar_plot +
  labs(
    title = "Cardiovascular Event by Smoking Status",
    x = "Smoking Status",
    y = "Proportion (-- : Average)",
    fill = "Cardiovascular Event"
  )

# Render the static ggplot as an interactive plotly widget of fixed size.
ggplotly(p = bar_plot, width = 500, height = 600)